tidy data

0.1 Simulated data

Notice the wide format

# Fix the RNG seed so the simulated data set is reproducible.
set.seed(1)

# Simulate a wide-format data set: one row per participant (ID) and one
# column per measurement occasion (t1, t2, t3) of the pg13 sum score.
Sorg <- tibble(
  ID = seq_len(100),                                      # n = 100
  ga_week = sample(13:43, 100, replace = TRUE),           # Gestational week
  gender = sample(c("Male", "Female"), 100,
                  replace = TRUE, prob = c(0.4, 0.6)),    # More women
  t1_pg13_sum = rnorm(100, mean = 20, sd = 10),           # Values from a normal distribution
  t2_pg13_sum = rnorm(100, mean = t1_pg13_sum - 5, sd = 5), # Scores tend to drop
  t3_pg13_sum = rnorm(100, mean = t2_pg13_sum - 5, sd = 3)  # Scores tend to drop
) %>%
  # The two across() calls run in order, so the second one operates on the
  # scores already shifted by the first.
  mutate(
    across(.cols = ends_with("_sum"),
           .fns = ~ if_else(gender == "Female", .x + 10, .x)),  # Women have higher scores
    across(.cols = ends_with("_sum"),
           .fns = ~ .x + ga_week / 2)                           # Higher scores with higher GA week
  ) %>%

  # Create ga_cat on the basis of ga_week.
  # ga_cat is an ordered categorical variable; case_when() uses the first
  # condition that matches, so each cut-off is the upper bound of its band.
  # NOTE: ga_week is sampled from 13:43, so week 13 also lands in "14-21".
  mutate(
    ga_cat = case_when(
      ga_week < 22 ~ "14-21",
      ga_week < 37 ~ "22-36",
      ga_week < 43 ~ "37-42",   # week 42 belongs in "37-42", not in "43+"
      ga_week >= 43 ~ "43+"
    ) %>% factor(ordered = TRUE)
  )

0.2 Print

# A tibble: 100 × 7
      ID ga_week gender t1_pg13_sum t2_pg13_sum t3_pg13_sum ga_cat
   <int>   <int> <chr>        <dbl>       <dbl>       <dbl> <ord> 
 1     1      37 Female        40.7        31.3       30.7  37-42 
 2     2      16 Female        31.8        33.2       30.2  14-21 
 3     3      19 Male          30.0        27.9       28.8  14-21 
 4     4      13 Male          15.2        14.1        8.27 14-21 
 5     5      14 Female        42.8        45.6       36.8  14-21 
 6     6      41 Female        37.7        30.9       24.7  37-42 
 7     7      35 Female        63.8        62.8       58.1  22-36 
 8     8      23 Male          26.5        21.2       15.5  22-36 
 9     9      26 Female        59.8        52.3       46.0  22-36 
10    10      30 Male          30.9        30.5       20.8  22-36 
# ℹ 90 more rows

1 Plotting with a wide data structure

NOT RECOMMENDED!!!

1.1 Example 1

# Wide data: each time point lives in its own column, so each one needs its
# own geom_point() layer. No colour is set, so the layers look alike.
ggplot(data = Sorg, mapping = aes(x = ga_week)) +
  geom_point(mapping = aes(y = t1_pg13_sum)) +
  geom_point(mapping = aes(y = t2_pg13_sum)) +
  geom_point(mapping = aes(y = t3_pg13_sum))

1.2 Example 2

# Wide data again: the colour of every layer has to be hard-coded by hand,
# and because colour is not mapped inside aes() no legend is produced.
ggplot(data = Sorg, mapping = aes(x = ga_week)) +
  geom_point(mapping = aes(y = t1_pg13_sum), color = "red") +
  geom_point(mapping = aes(y = t2_pg13_sum), color = "blue") +
  geom_point(mapping = aes(y = t3_pg13_sum), color = "yellow")

1.3 Example 3

# Same hand-coloured layers as before, now split into one panel per gender.
ggplot(data = Sorg, mapping = aes(x = ga_week)) +
  geom_point(mapping = aes(y = t1_pg13_sum), color = "red") +
  geom_point(mapping = aes(y = t2_pg13_sum), color = "blue") +
  geom_point(mapping = aes(y = t3_pg13_sum), color = "yellow") +
  facet_wrap(vars(gender))

2 Solution! Make the data longer (tidy) with pivot_longer()

2.1 pivot_longer()

# Reshape from wide to long (tidy): the columns ending in "_sum" are stacked
# into one row per ID and time point. `Tid` holds the former column name and
# `Grief` holds the score.
Sorg_long <- pivot_longer(
  Sorg,
  cols = ends_with("_sum"),
  names_to = "Tid",
  values_to = "Grief"
)
# A tibble: 300 × 6
      ID ga_week gender ga_cat Tid         Grief
   <int>   <int> <chr>  <ord>  <chr>       <dbl>
 1     1      37 Female 37-42  t1_pg13_sum  40.7
 2     1      37 Female 37-42  t2_pg13_sum  31.3
 3     1      37 Female 37-42  t3_pg13_sum  30.7
 4     2      16 Female 14-21  t1_pg13_sum  31.8
 5     2      16 Female 14-21  t2_pg13_sum  33.2
 6     2      16 Female 14-21  t3_pg13_sum  30.2
 7     3      19 Male   14-21  t1_pg13_sum  30.0
 8     3      19 Male   14-21  t2_pg13_sum  27.9
 9     3      19 Male   14-21  t3_pg13_sum  28.8
10     4      13 Male   14-21  t1_pg13_sum  15.2
# ℹ 290 more rows

3 Plotting with a tidy data structure

3.1 Example 1

# Tidy data: a single geom_point() layer is enough. Time point (Tid) is
# mapped to colour, and gender is shown in side-by-side panels.
ggplot(Sorg_long, aes(ga_week, Grief, color = Tid)) +
  geom_point() +
  facet_grid(cols = vars(gender))

3.1 Example 1

3.2 Example 2

# Scatter plot with a linear-model fit line, one panel per combination of
# time point (rows) and gender (columns).
ggplot(Sorg_long, aes(ga_week, Grief, color = Tid)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_grid(rows = vars(Tid), cols = vars(gender))

3.2 Example 2

3.3 Example 3

# Box plots of Grief per time point, with the raw observations on top.
# geom_path() is grouped by ID, so each participant's points are joined
# into one trajectory. Panels: ga_cat in rows, gender in columns.
ggplot(Sorg_long, aes(x = Tid, y = Grief, fill = ga_cat)) +
  geom_boxplot(alpha = 0.5) +
  geom_point(alpha = 0.3) +
  geom_path(aes(group = ID), alpha = 0.3) +
  facet_grid(rows = vars(ga_cat), cols = vars(gender))

3.3 Example 3

3.4 Also see

Slides + exercises from Day 1 (Other important functions for data wrangling.)